''' Goal: Analyze the results of high-throughput sequencing of an EATING library made from E. coli genomic DNA.

Steps: 1) Load in the FASTQ data. 2) Trim off the 3' guide-hairpin sequence and the 5' T7 promoter (a minimal trimming sketch follows this docstring).

- Perhaps analyze these trimmed regions later.
- Idea for the most accurate matching: make a single-entry BLAST database for the T7 promoter and one for the sgRNA hairpin.
Get hit locations and use these to trim? That would also give quality information on the hairpin and T7 regions.

3) From the 20mer targets:

- Check the percentage that map to the E. coli genome.
- The percentage of mapped 20mers that map next to PAM cutters / at theoretically predicted sites.
    - The percentage that MBN (mung bean nuclease) trimmed back a base too far.
- Which PAM cutters are underrepresented?

'''
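
A minimal sketch of the exact-match trimming in step 2, using the two motifs that appear in the code below (the first 9 nt of the sgRNA hairpin, GTTTAAGAG, and the 3' end of the T7 promoter, ACTCACTATAG); the toy read is made up, and orientation/quality handling is ignored here:

In [ ]:
import re

def trim_read(read):
    """Return the putative target between the T7 promoter and the guide hairpin, or None."""
    hairpin = re.search("GTTTAAGAG", read)        # start of the sgRNA constant region
    if hairpin is None:
        return None
    target = read[:hairpin.start()]               # everything 5' of the hairpin
    t7 = re.search("ACTCACTATAG", target)         # 3' end of the T7 promoter, if still present
    if t7 is not None:
        target = target[t7.end():]                # drop the promoter and anything upstream of it
    return target

trim_read("NNNACTCACTATAGCCATAATGCGGGCGGTTGCCGTTTAAGAGCTA")  # -> 'CCATAATGCGGGCGGTTGCC'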


In [1]:
import os
import Bio 
import re
import timeit
import copy
import sys
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction 
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import pandas
import cPickle as pickle
from numpy import *
from pylab import *
from collections import Counter
import itertools
import operator

In [1]:


In [2]:
len((ecoligenome)[0].seq)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a87b0bab2bea> in <module>()
----> 1 len((ecoligenome)[0].seq)

NameError: name 'ecoligenome' is not defined

In [3]:
# Load in the sequencing data. 
reads = []
handle1 = open("hiseqdata/Sample_ADL1_index7/ADL1_all.fastq", 'rb')
record_iter_ADL1 = SeqIO.parse(handle1, "fastq")
handle2 = open("hiseqdata/Sample_ADL_2_index8/ADL2_all.fastq", 'rb')
record_iter_ADL2 = SeqIO.parse(handle2, "fastq")

In [4]:
# a generator that yields items instead of returning a list
def firstn(n, a):
    num = 0
    while num < n:
        yield a.next()
        num += 1

#sum_of_first_n = sum(firstn(1000000))
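
As a side note, the same effect is available from itertools.islice (itertools is already imported above); a minimal equivalent:

In [ ]:
import itertools
# Lazily yields at most the first 100 records, equivalent to firstn(100, record_iter_ADL1) above.
first_100 = itertools.islice(record_iter_ADL1, 100)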

In [4]:


In [8]:
sgRNAfiltlib5pr_ADL1 = firstn(100, record_iter_ADL1)
sgRNAfiltlib5pr_ADL2 = firstn(40000000, record_iter_ADL2)

In [6]:
# Define the filter that delineates the end of a target 20mer. This uses the first 9nt of the sgRNA hairpin. 
sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))

# Generator: for each read, mark every occurrence of the 9 nt sgRNA hairpin signature as a SeqFeature,
# take the sequence immediately 5' of it (reverse-complemented for minus-strand hits), trim off any
# remaining T7 promoter, and yield at most one spacer per read, provided every base has Phred
# quality >= 30 and the spacer is longer than 10 nt.
def spacersonly(seqs):
    while True:
        seqrecord = seqs.next()
        #for seqrecord in seqs:
        fwdlocs = []  
        revlocs = []  
        fwdlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.seq), str(seqrecord.seq))]
        for item in fwdlocs:
            start = ExactPosition(int(item) + 1)
            end = ExactPosition(int((item) + len(sgRNAconst)))
            location = FeatureLocation(start, end)
            feature = SeqFeature(location,type="sgRNAconst", strand = +1)
            seqrecord.features.append(feature)
        revlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.reverse_complement().seq), str(seqrecord.seq))]
        for item in revlocs:
            start = ExactPosition(int(item) + 1)
            end = ExactPosition(start + len(sgRNAconst) - 1)
            location = FeatureLocation(start, end)
            feature = SeqFeature(location,type="sgRNAconst", strand = -1)
            seqrecord.features.append(feature)
        for feat in seqrecord.features:
            if feat.strand == 1:
                tgtstart = int(feat.location.start) - 36 # -21
                tgtend = int(feat.location.start) - 1
                sgtgt = seqrecord[tgtstart:tgtend]
                #yield sgtgt
                #alltgts.append(sgtgt)
                #print "pos \n \n"
            if feat.strand == -1:
                tgtend = int(feat.location.end) + 36 # +21
                tgtstart = int(feat.location.end)
                sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
                sgtgt.name=seqrecord.name
                #yield sgtgt
                #alltgts.append(sgtgt)
            bad = 0
            try:
                l = [tloc.end() for tloc in re.finditer("ACTCACTATAG", str(sgtgt.seq))]
                sgtgt = sgtgt[int(l[0]):]
            except IndexError:
                pass  # no T7 promoter remnant found in this spacer
            for score in sgtgt.letter_annotations["phred_quality"]:
                if score < 30:
                    bad = 1
            if bad == 0 and len(sgtgt) > 10:
                yield sgtgt
                break
            #except:
                #yield None
            #sys.stdout.flush() # These two lines produce the line counter as the loop runs
            #sys.stdout.write('\r' + str(index) + " "),
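
A toy check of the signature search used in spacersonly (the read is made up; the reverse complement of GTTTAAGAG is CTCTTAAAC):

In [ ]:
import re
read = "ACGTGTTTAAGAGACGT" + "ACGTCTCTTAAACACGT"
print [m.start() for m in re.finditer("GTTTAAGAG", read)]   # forward-strand signature -> [4]
print [m.start() for m in re.finditer("CTCTTAAAC", read)]   # reverse-complement signature -> [21]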

First goal: plot read abundance vs. unique reads


In [10]:
#b = spacersonly(sgRNAfiltlib5pr_ADL1)
# c = spacersonly(sgRNAfiltlib5pr_ADL2)
b = spacersonly(itertools.chain(record_iter_ADL1, record_iter_ADL2))

In [ ]:
alltgts = []
for index, item in enumerate(b):
    alltgts.append(item)
    if str(index)[-4:] == "0000": #only print every 10000
        sys.stdout.flush() # These two lines produce the line counter as the loop runs
        sys.stdout.write('\r' + str(index) + " "),

In [12]:
# pickle.dump(alltgts, open("alltgts for ADL1 and ADL2 product of spacersonly.pkl", "wb"))

In [2]:
alltgts = pickle.load(open("alltgts for ADL1 and ADL2 product of spacersonly.pkl", "rb"))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-2-5dd1bc9d8d58> in <module>()
----> 1 alltgts = pickle.load(open("alltgts for ADL1 and ADL2 product of spacersonly.pkl", "rb"))

/Users/andylane/anaconda/python.app/Contents/lib/python2.7/copy_reg.pyc in _reconstructor(cls, base, state)
     50         obj = base.__new__(cls, state)
     51         if base.__init__ != object.__init__:
---> 52             base.__init__(obj, state)
     53     return obj
     54 

KeyboardInterrupt: (None, <function _reconstructor at 0x100517a28>, (<class 'Bio.SeqRecord._RestrictedDict'>, <type 'dict'>, {'phred_quality': [41, 37, 39, 40, 40, 41, 41, 40, 41, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41]}))

In [ ]:
# Bio.SeqIO.write(alltgts, "HiSeqOut/alltgts_ADL2.fastq", "fastq")

In [2]:
import shelve
import time

In [3]:
alltgts = shelve.open("alltgtsshelf", protocol=2)

In [13]:
# hh = itertools.chain(SeqIO.parse(open("HiSeqOut/alltgts_ADL2.fastq", 'rb'), "fastq"),\
#                      SeqIO.parse(open("HiSeqOut/alltgts_ADL1.fastq", 'rb'), "fastq"))
# i = 0
# for item in hh:
#     alltgts[item.id] = str(item.seq)
#     i = i+1
#     if i == 1000:
#         alltgts.sync()
#         i = 0

In [61]:
item


Out[61]:
SeqRecord(seq=Seq('CCATAATGCGGGCGGTTGCC', SingleLetterAlphabet()), id='HS1:391:C5CWBACXX:2:1103:2123:57903', name='HS1:391:C5CWBACXX:2:1103:2123:57903', description='HS1:391:C5CWBACXX:2:1103:2123:57903 1:N:0:ACTTGA', dbxrefs=[])

In [5]:
size(alltgts)


Out[5]:
1

This part makes a dict of the reads list, to generate a unique-reads list


In [5]:
tgtsdict = {}
for item in alltgts.values():
    tgtsdict[str(item.seq)] = item

#print len(alltgts)
print len(tgtsdict)


191583

In [6]:
#tgtsdict

Worth plotting the raw spacer distribution and the biologically-relevant spacer distribution (i.e. first 20mers)


In [7]:
# Trim alltgts to 20nt; see number of unique spacers:
# This trims each read in the (non-unique) reads list to its first 20 nt
tgts20 = []
for item in alltgts:
    t = item[:20]
    tgts20.append(t)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-036a1661aff4> in <module>()
      3 tgts20 = []
      4 for item in alltgts:
----> 5     t = item[:20]
      6     tgts20.append(t)

/Users/andylane/anaconda/lib/python2.7/site-packages/Bio/SeqRecord.pyc in __getitem__(self, index)
    461             #(this should also work with strides, even negative strides):
    462             for key, value in self.letter_annotations.items():
--> 463                 answer._per_letter_annotations[key] = value[index]
    464 
    465             return answer

KeyboardInterrupt: 

In [137]:
import csv
with open('alltgts.csv', 'wb') as tgtswriter:
    a = csv.writer(tgtswriter, delimiter=',')
    for item in alltgts.items():
        a.writerow((item[1],))

In [8]:
# This part makes a dictionary of the first 20mers of each read
tgts20dict = {}
for item in tgts20:
    tgts20dict[str(item.seq)] = item

In [9]:
len(tgts20)


Out[9]:
4169111

In [10]:
len(tgts20dict)


Out[10]:
163525

In [37]:
stringsonly = (str(item[-20:]) for item in alltgts.values() if len(item) > 10)

In [38]:
c = Counter(stringsonly)

In [ ]:
import operator
sorted_c = sorted(c.items(), key=operator.itemgetter(1))

In [ ]:
x = xrange(0, len(sorted_c))

In [ ]:
counts = []
for item, count in sorted_c:
    counts.append(count)

In [ ]:
len(x)

In [ ]:
import pylab
pylab.plot(x, counts)
pylab.ylim(0, 5000)

In [ ]:
counts[-2]

In [ ]:
bins = []
for b in xrange(2, 48):
    if 2**b < 100000:
        bins.append(2**b)

In [ ]:
%pylab inline
figure()
hist(counts, bins)
gca().set_xscale("log")

This indicates that only a small percentage of unique spacers are present in many copies (>100 copies). However, if that small number of overrepresented spacers makes up a large percentage of the library, this could be problematic. We therefore decided to plot the abundance of reads as a percentage of the total library.


In [ ]:
# Percentile-bin the data to show what percent of the library is made up of overrepresented species. This has direct relevance
# to using it in screens. In other words, what percent of the library does the most abundant 0.1% unique reads produce?
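
A minimal sketch of that question, assuming a Counter of 20mer read counts like the Counter c built earlier (not run here):

In [ ]:
def top_fraction(counter, top=0.001):
    """Percent of all reads contributed by the most abundant `top` fraction of unique 20mers."""
    counts = sorted(counter.values(), reverse=True)
    n_top = max(1, int(len(counts) * top))
    return 100.0 * sum(counts[:n_top]) / sum(counts)

# e.g. top_fraction(c)   # percent of the library contributed by the top 0.1% of unique spacers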

In [16]:
stringsonly = (str(item[-20:]) for item in alltgts.values() if len(item) > 10)

In [17]:
sys.path.append("../VirtualEating/")
import al_funcs

In [18]:
%pylab inline
al_funcs.al_plot_complexity(stringsonly)
#savefig('HiSeqOut/Unique Reads vs Percent of library ADL1.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [20]:
import random

In [117]:
rc = dict(random.sample(c, 10000))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-117-799baeacaded> in <module>()
----> 1 rc = dict(random.sample(c, 10000))

AttributeError: 'builtin_function_or_method' object has no attribute 'sample'

In [124]:
rc = dict(c)

In [34]:
import pandas as pd
from ggplot import *

In [133]:
df = pd.DataFrame({
    "counts": [item[1] for item in rc.items() if item[1] >1],
})
df = pd.melt(df)

In [146]:
ggplot(aes(x='value', color='variable'), data=df) + \
    geom_histogram(binwidth = 2)


Out[146]:
<ggplot: (303110821)>

In [152]:
len(alltgts)


Out[152]:
4165908

In [153]:
4165908 /125230


Out[153]:
33

In [6]:
del freqs


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-832ee327e4ef> in <module>()
----> 1 del freqs

NameError: name 'freqs' is not defined

In [8]:
math.log(2,2)


Out[8]:
1.0

In [18]:
counts[-50:]


Out[18]:
[10.159871336778389,
 10.17492568250068,
 10.216745858195306,
 10.27262978497637,
 10.293471648838134,
 10.321928094887362,
 10.352043425795433,
 10.35974956032233,
 10.382624026574916,
 10.383704292474052,
 10.397674632948268,
 10.417852514885897,
 10.46352437327118,
 10.46454575033394,
 10.492854620174748,
 10.515699838284043,
 10.521600439723727,
 10.52845411076479,
 10.530406337099068,
 10.54978466794786,
 10.595257481449035,
 10.61470984411521,
 10.61470984411521,
 10.615629636968098,
 10.676838606474155,
 10.744833837499547,
 10.758223214726724,
 10.786269627648466,
 10.824163209744158,
 10.874212934830874,
 10.89178370321831,
 10.917372079476841,
 10.95274124718649,
 11.03754695396218,
 11.113090983442381,
 11.172427508645484,
 11.179287104646004,
 11.194141238863136,
 11.234817431117325,
 11.285402218862249,
 11.354800344350597,
 11.459431618637298,
 11.52845411076479,
 11.655530723155643,
 11.775610280759539,
 12.33147682185962,
 12.66177809777199,
 12.987796929042542,
 13.909330593006835,
 17.042022688250174]

In [36]:
c


Out[36]:
Counter()

In [40]:
import math
mean_guide_copy = len(alltgts) / float(len(c.items()))
import operator
sorted_c = sorted(c.items(), key=operator.itemgetter(1))
counts = []
for item, count in sorted_c:
    if count >1:
#       counts.append(count/mean_guide_copy)
        counts.append(math.log(float(count),2))

# bins = []
# for b in xrange(0, 48):
#     if (2**b)/100.0 < 300000:
#         bins.append((2**b)/100.0)
bins=[x/2.0 for x in (xrange(0, 50, 1))]
%pylab inline
figure()
#gca().set_xscale("log")
# xticks(bins[:], [str(item)[0:5] for item in bins[:]], rotation = -90)
hist(counts, bins[:], color = "gray")
xlabel("log2 read count")
ylabel("Count of unique guides")
savefig('HiSeqOut/Hist for all guides- bothreps.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [ ]:


In [114]:
import csv
with open('gcvsabundance.csv', 'wb') as gcwriter:
    a = csv.writer(gcwriter, delimiter=',')
    for item in gcs:
        a.writerow((item[0], item[1], item[2]))

In [33]:
gcs = []
for item, count in c.items():
    g = float(item.count("G"))
    cn = float(item.count("C"))
    a = float(item.count("A"))
    t = float(item.count("T"))
    gc = (g+cn)*100.0/(g+cn+a+t)
    gcs.append((item, count, gc))
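
As an aside, Bio.SeqUtils.GC computes the same percentage (it also counts ambiguous S bases, which should not occur in these 20mers); a minimal equivalent of the loop above:

In [ ]:
from Bio.SeqUtils import GC
gcs = [(item, count, GC(item)) for item, count in c.items()]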

In [ ]:
plot

In [50]:
from ggplot import *

In [51]:
gcsdf = pandas.DataFrame(data=gcs)

In [ ]:
bins = np.logspace(gcsdf[1].min()+1, gcsdf[1].max(), base=2, dtype=list)

In [ ]:
p = ggplot(aes(x=gcsdf[2], y=gcsdf[1]), data=gcsdf)
p+ geom_jitter(alpha=0.05, position="jitter") + scale_y_log10()

Closer analysis revealed that the MmeI enzyme used in trimming genome fragments to 20nt produced a mixture of 20/21nt spacers. We quantified this for all reads (not just unique reads):
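
The 20 nt vs 21 nt fractions can be read straight off a length Counter (a sketch mirroring the loop in the next cell; from the Counter shown a few cells below, roughly 52.7% of spacers are 21 nt and 45.1% are 20 nt):

In [ ]:
from collections import Counter
length_counts = Counter(len(item) for item in alltgts)
total = float(sum(length_counts.values()))
print length_counts[21] / total * 100, "% are 21 nt"
print length_counts[20] / total * 100, "% are 20 nt"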


In [10]:
lengths = []
for item in alltgts:
    lengths.append(len(item))

In [11]:
list(xrange(0, 10, 1))


Out[11]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy

# This sets my pyplot font to Arial
# http://stackoverflow.com/questions/16574898/how-to-load-ttf-file-in-matplotlib-using-mpl-rcparams?lq=1
path = '/Library/Fonts/Microsoft/Arial.ttf'
prop = fm.FontProperties(fname=path)
mpl.rcParams['font.family'] = prop.get_name()

In [25]:
Counter(lengths)


Out[25]:
Counter({21: 2196483, 20: 1880968, 35: 46417, 22: 23942, 19: 10893, 11: 1784, 23: 1666, 24: 1144, 12: 1139, 18: 1094, 14: 875, 13: 792, 17: 753, 16: 738, 15: 390, 25: 9, 28: 8, 26: 5, 27: 4, 31: 3, 29: 2, 36: 2})

In [30]:
%pylab inline
figure(figsize=(10, 4))
bins = list(xrange(19,36,1))
hist(lengths, bins, color ="gray")
title("MmeI cut distance")
xlabel("Spacer length")
ylabel("Frequency")
#plt.yscale('log')
xticks(list(xrange(19,36,1)))
#plt.yticks( [0, 20, 40,60,80],  [0, 20, 40,60,80] )
#plt.ylim([0,100])
tick_params(axis=u'both', labelsize=18)
savefig('HiSeqOut/MmeI cut distance bothreps.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [68]:
h = open("HiSeqOut/alltgts_ADL2.fastq", "rb")
conv = SeqIO.parse(h, "fastq")

In [69]:
SeqIO.write(conv, "HiSeqOut/alltgts_ADL2.fasta", "fasta")


Out[69]:
1906397

In [1]:
#Made a BLAST database from E. coli genome

# To match against the primer BLAST database, which of the mappable 20mers are basically fragment ends?
# First, write out a query fasta containing the mappable targets:
#Bio.SeqIO.write(alltgts, "alltgts.temp", "fasta")

blastn_cline = NcbiblastnCommandline(query="HiSeqOut/alltgts_ADL2.fasta", db="ecolik12", \
task = "blastn-short",outfmt=5, out="HiSeqOut/alltgts_ADL2_more_stringent_eval.blast", max_target_seqs=10, num_threads = 7, evalue = 0.0001, export_search_strategy="alltgts_ADL2_more_stringent_eval.strategy")
#timeit.timeit(blastn_cline, number =1)
blastn_cline()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-6fa6e56c8580> in <module>()
      5 #Bio.SeqIO.write(alltgts, "alltgts.temp", "fasta")
      6 
----> 7 blastn_cline = NcbiblastnCommandline(query="HiSeqOut/alltgts_ADL2.fasta", db="ecolik12", task = "blastn-short",outfmt=5, out="HiSeqOut/alltgts_ADL2_more_stringent_eval.blast", max_target_seqs=10, num_threads = 7, evalue = 0.0001, export_search_strategy="alltgts_ADL2_more_stringent_eval.strategy")
      8 #timeit.timeit(blastn_cline, number =1)
      9 blastn_cline()

NameError: name 'NcbiblastnCommandline' is not defined
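
The ecolik12 database used as the BLAST target here was built beforehand from the E. coli K-12 genome FASTA loaded elsewhere in this notebook; a typical BLAST+ invocation for that step, shown as an IPython shell escape (the exact path is an assumption based on the genome file used below), would be:

In [ ]:
!makeblastdb -in "../../Genomic Data/Escherichia coli str. K-12 substr. MG1655, complete genome.fasta" -dbtype nucl -out ecolik12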

The E-value on the first run was 0.0001 and max_target_seqs was 100; this was changed to a less stringent E-value (0.005).

This part bins the hits into 4 categories (restated as a small helper in the sketch below):

a) Hits in the E. coli genome that are also predicted cutter sites (i.e. good)
b) Hits in the E. coli genome that are not predicted cutter sites (MBN errors)
c) Short, ambiguous spacers
d) Longer spacers that are not in the E. coli genome

The next cells deal with category a) hits in the E. coli genome at predicted cutter sites (i.e. good) and category b) hits in the E. coli genome that aren't predicted cutters (i.e. probably MBN failures).
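
Restated as a small helper, using the thresholds from the classification loop further down (top HSP score > 19 counts as a full-length genome hit, > 14 as a partial match; cutslistdict, built a few cells below, holds the predicted-cutter 20mers). A sketch only:

In [ ]:
def classify_hit(blast_record, cutslistdict):
    """Return 'a', 'b', 'c' or 'd' for one BLAST record, mirroring the binning loop below."""
    try:
        hsp = blast_record.alignments[0].hsps[0]
    except IndexError:
        return 'd'                                   # no alignment at all
    if hsp.score > 19:
        # full-length hit: 'a' if the 20mer is a predicted cutter site, otherwise an MBN error
        return 'a' if hsp.query[0:20] in cutslistdict else 'b'
    if hsp.score > 14:
        return 'c'                                   # short, ambiguous match
    return 'd'                                       # maps poorly or not at all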


In [49]:
%run al_funcs.ipynb

In [47]:
handle = open("../../Genomic Data/Escherichia coli str. K-12 substr. MG1655, complete genome.fasta", 'rb')
ecoligenome = [SeqIO.read(handle, "fasta", alphabet=IUPACAmbiguousDNA())]

In [50]:
cutslist = [item for item in al_digesttarget(ecoligenome)]

In [51]:
cutslist[0]


Out[51]:
SeqRecord(seq=Seq('GGGACTCGCCGCCGCCCAGC', IUPACAmbiguousDNA()), id='0_F', name='564', description='HpaII', dbxrefs=['gi|545778205|gb|U00096.3|'])

In [20]:
Bio.SeqIO.write(cutslist, "ecoligenomeguides.gb", "gb")


Out[20]:
92450

In [53]:
# This gives the number of unique guides expected from the E. coli genome
len(set([str(item.seq) for item in cutslist]))


Out[53]:
80894

In [21]:
cutslistdict = {}
for item in cutslist:
    cutslistdict[str(item.seq)] = item.description

In [22]:
cutslistdict['ATGGCGCGCACTACCGCCAC']


Out[22]:
'HpaII'

In [23]:
# Load in the BLAST results
result_handle = open("HiSeqOut/alltgts_ADL2.blast")
ecoli_hits_blast = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here

In [24]:
item = next(ecoli_hits_blast)

In [26]:
item.alignments[0].hsps[0].query[0:20]


Out[26]:
u'CGAGCAGGGTTTTCAACGCA'

In [39]:
a_goodguides = []
b_MBNerrors = []
c_15match_guides = []
whittlinglist = []

# IF a BLAST entry's top HSP score is > 19 and its first 20 nt are in the theoretical guides list, put it into a_goodguides.
# ELSE IF the score is > 19 but the 20mer isn't in the guides list, put it into b_MBNerrors.
# ELSE IF the score is > 14, put it into c_15match_guides.
# Otherwise (including entries with no alignments), put it into whittlinglist to be dealt with later.
index = 0

for item in range(100000):
    try:
        item = next(ecoli_hits_blast)
    except:
        break
    index += 1
    try:
        if item.alignments[0].hsps[0].score > 19:
            try: 
                a_goodguides.append((cutslistdict[item.alignments[0].hsps[0].query[0:20]], item.alignments[0].hsps[0].query[0:20]))
            except:
                b_MBNerrors.append(item)
        elif item.alignments[0].hsps[0].score > 14:
            c_15match_guides.append(item)
        else:
            whittlinglist.append(item)
    except:
        whittlinglist.append(item)
    if str(index)[-5:] == "00000": #only print every 100000
        sys.stdout.flush() # These two lines produce the line counter as the loop runs
        sys.stdout.write('\r' + str(index) + " "),


100000 

In [25]:
#def carveup(item):

In [26]:
index


Out[26]:
1906397

In [27]:
len(a_goodguides)


Out[27]:
885983

In [28]:
len(b_MBNerrors)


Out[28]:
900331

In [29]:
len(c_15match_guides)


Out[29]:
0

In [30]:
len(whittlinglist)


Out[30]:
120083

In [31]:
pickle.dump(a_goodguides, open( "HiSeqOut/a_goodguides_ADL2_more_stringent.p", "wb" ))
pickle.dump(b_MBNerrors, open( "HiSeqOut/b_MBNerrors_ADL2_more_stringent.p", "wb" ))
pickle.dump(c_15match_guides, open( "HiSeqOut/c_15match_guides_ADL2_more_stringent.p", "wb" ))
pickle.dump(whittlinglist, open( "HiSeqOut/whittlinglist_ADL2_more_stringent.p", "wb" ))

#favorite_color = pickle.load( open( "save.p", "rb" ) )
#del a_goodguides
#del b_MBNerrors

In [35]:
a_goodguides = pickle.load( open( "HiSeqOut/a_goodguides_ADL2_more_stringent.p", "r"))

In [39]:
a_goodguides_1 = pickle.load( open( "HiSeqOut/a_goodguides_ADL1_more_stringent.p", "r"))

In [42]:
print len(a_goodguides) # This is the number of good guides in ADL2
print len(a_goodguides_1) # This is the number of good guides in ADL1
print len(set(a_goodguides))
print len(set(a_goodguides_1))


885983
1003280
37854
38774


In [43]:
a_goodguides_bothreps = a_goodguides
a_goodguides_bothreps.extend(a_goodguides_1)

In [44]:
len(set(a_goodguides_bothreps))


Out[44]:
41638

In [6]:
b_MBNerrors = pickle.load( open( "HiSeqOut/b_MBNerrors_ADL2_more_stringent.p", "r"))

In [7]:
b_MBNerrors_1 = pickle.load( open( "HiSeqOut/b_MBNerrors_Ab_MBNerrors_more_stringent.p", "r"))

In [9]:
len(b_MBNerrors_1)


Out[9]:
1037184

In [25]:
b_MBNerrors[11].query_letters


Out[25]:
20

In [28]:
from collections import Counter

In [38]:
Counter([len(sequence) for enz, sequence in a_goodguides[:1000]])


Out[38]:
Counter({20: 1000})

In [34]:
Counter([l.query_length for l in b_MBNerrors_1[0:200000]])


Out[34]:
Counter({21: 187416, 20: 5887, 35: 4905, 22: 1659, 24: 66, 23: 65, 27: 2})

In [56]:
b_MBNerrors_bothreps = b_MBNerrors
b_MBNerrors_bothreps.extend(b_MBNerrors_1)

In [57]:
len(set([item.alignments[0].hsps[0].query for item in b_MBNerrors_bothreps]))


Out[57]:
83823

In [58]:
len(set([item.alignments[0].hsps[0].query[0:20] for item in b_MBNerrors_bothreps]))


Out[58]:
67378

In [ ]:


In [ ]:

Result:

Experiment   E value   Category        Count
ADL1         0.0001?   Total           2,262,713
                       A: good         1,003,277
                       B: MBN error    1,037,187
                       Others?           222,250
ADL1         0.005?    Total           2,262,713
                       A: good         1,003,277
                       B: MBN error    1,037,187
                       C: 15-matches      22,370
                       Others?           199,880
ADL2         0.005?    Total           1,906,396
                       A: good           885,981
                       B: MBN error      900,333
                       C: 15-matches      19,354
                       Others?           199,880
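
As a quick sanity check on the table, the ADL2 category fractions (counts taken directly from the rows above; these three categories account for about 94.7% of ADL2 reads):

In [ ]:
total = 1906396.0
for name, n in [("A: good", 885981), ("B: MBN error", 900333), ("C: 15-matches", 19354)]:
    print name, round(n / total * 100, 1), "%"
# A: good 46.5 %, B: MBN error 47.2 %, C: 15-matches 1.0 %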

In [40]:
handle = open("HiSeqOut/alltgts_ADL1.fastq", 'rb')
ADL1 = SeqIO.parse(handle, "fastq")

In [41]:
#next(ADL1)

In [42]:
ADL1_dict = {}
for item in ADL1:
    ADL1_dict[str(item.description)] = item

What's in the "remainders" list (short reads, other organisms' reads...)


In [43]:
whittlinglist_with_data = []

for item in whittlinglist:
    whittlinglist_with_data.append((ADL1_dict[item.query], item))

In [44]:
lengths = []
for item, blast in whittlinglist_with_data:
    lengths.append(len(item))

In [45]:
%pylab inline
figure()
bins = list(xrange(0,37,1))
hist(lengths, bins)
title("MmeI cut distance")
xlabel("Spacer length")
ylabel("Frequency")
#plt.yscale('log')
#xticks([18, 19, 20,21,22, 23, 24, 25, 26])
#plt.yticks( [0, 20, 40,60,80],  [0, 20, 40,60,80] )
#plt.ylim([0,100])
savefig('HiSeqOut/MmeI cut distance for non E coli ADL1.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['power', 'uniform', 'shuffle', 'sample', 'seed', 'randint', 'linalg', 'draw_if_interactive', 'random', 'fft', 'triangular', 'info']
`%matplotlib` prevents importing * from pylab and numpy

In [ ]:
#### The spacer length in the remainders list looks pretty normal. What about the BLAST properties? How many are just 1-2 nt mismatches?

In [46]:
nearmatch = 0
noalignments = 0

for item in whittlinglist_with_data:
    try:
        if item[1].alignments[0].hsps[0].score > 15:
            nearmatch += 1
    except:
        noalignments += 1

In [47]:
print(nearmatch)


0

In [48]:
print(noalignments)


199880

In [49]:
hitlengths = []
for item, blast in whittlinglist_with_data:
    hitlengths.append(len(blast.alignments))

In [50]:
len(whittlinglist_with_data)


Out[50]:
199880

In [40]:
stringsonly = []
for cut, item in a_goodguides:
    stringsonly.append(str(item[0:20]))

In [41]:
al_plot_complexity(stringsonly)
savefig('HiSeqOut/Unique Reads vs Percent of library ADL2_ForGoodGuidesOnly_more_stringent_eval_100000_only.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [34]:
stringsonly = []
for item in b_MBNerrors:
    stringsonly.append(str(item.alignments[0].hsps[0].query[0:20]))
al_plot_complexity(stringsonly)
savefig('HiSeqOut/Unique Reads vs Percent of library ADL2_b_MBNerrors_more_stringent_eval.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [62]:
item


Out[62]:
<Bio.Blast.Record.Blast at 0x3bf90a710>

In [20]:
stringsonly = []
for item in c_15match_guides:
    stringsonly.append(str(item.alignments[0].hsps[0].query[0:20]))
al_plot_complexity(stringsonly)
savefig('HiSeqOut/Unique Reads vs Percent of library ADL2_c_15match_guides_more_stringent_eval.pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [ ]:
whittlinglist[0]


Out[ ]:
<Bio.Blast.Record.Blast at 0x12be5a190>

In [21]:
stringsonly = []
for cut, item in whittlinglist:
    stringsonly.append(str(item.alignments[0].hsps[0].query[0:20]))
al_plot_complexity(stringsonly)
savefig('HiSeqOut/Unique Reads vs Percent of library ADL1_whittlinglist_more_stringent_eval.pdf', bbox_inches='tight')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-21-1fb550182b7f> in <module>()
      1 stringsonly = []
----> 2 for cut, item in whittlinglist:
      3     stringsonly.append(str(item.alignments[0].hsps[0].query[0:20]))
      4 al_plot_complexity(stringsonly)
      5 savefig('HiSeqOut/Unique Reads vs Percent of library ADL1_whittlinglist_more_stringent_eval.pdf', bbox_inches='tight')

TypeError: 'Blast' object is not iterable

Compared to the total "spacer" list, the good guides are somewhat more linearly distributed with respect to their representation in the guides list.

Go back and plot a histogram of representation for the good guides only:


In [18]:
a_goodguides_ADL2 = pickle.load( open( "HiSeqOut/a_goodguides_ADL2_more_stringent.p", "rb" ) )

In [11]:
a_goodguides_ADL1 = pickle.load( open( "HiSeqOut/a_goodguides_ADL1_more_stringent.p", "rb" ) )

In [19]:
len(a_goodguides_ADL2)


Out[19]:
885983

In [59]:
alltgts = a_goodguides_bothreps

In [60]:
alltgts[0]


Out[60]:
('HpaII', u'ATAGCGGTTAAAACCTTCAC')

In [60]:


In [61]:
# Trim alltgts to 20nt; see number of unique spacers:
# This trims each entry in the (non-unique) good-guides list to its first 20 nt
tgts20 = []
for item in alltgts:
    t = item[1][:20]
    tgts20.append(t)
from collections import Counter
c = Counter(tgts20)

In [62]:
len(c)


Out[62]:
41638

In [66]:
freqs = []
mean_guide_copy = len(a_goodguides_bothreps) / float(len(c))
for item in c:
    freqs.append((c[item], item))
import operator
sorted_c = sorted(c.items(), key=operator.itemgetter(1))
counts = []
for item, count in sorted_c:
    counts.append(count/mean_guide_copy)
bins = []
for b in xrange(0, 48):
    if (2**b)/100.0 < 3000:
        bins.append((2**b)/100.0)
bins = [0.015625,0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,16,32,64,128,256]
#for b in xrange(1, 48, 1):
    #if (1.2**b)/100 < 150:
        #bins.append((1.2**b)/100)

In [ ]:


In [67]:
%pylab inline
figure()
gca().set_xscale("log")
#gca().set_yscale("log")
xticks(bins[:], [str(item)[0:5] for item in bins[:]], rotation = -90)
hist(counts, bins[:], color = "gray")
savefig('HiSeqOut/Good Guides Count for unique instances bothreps LogLog (more_stringent_eval).pdf', bbox_inches='tight')


Populating the interactive namespace from numpy and matplotlib

In [128]:
sorted_c[0]


Out[128]:
(u'GCCAATACGGTCGAGAAAAC', 1)

In [119]:
bins


Out[119]:
[0.01,
 0.02,
 0.04,
 0.08,
 0.16,
 0.32,
 0.64,
 1.28,
 2.56,
 5.12,
 10.24,
 20.48,
 40.96,
 81.92,
 163.84,
 327.68,
 655.36,
 1310.72,
 2621.44]

In [43]:
min(counts)


Out[43]:
0.04272542475419957

In [25]:
# This gives the mean copies of guides (total good reads divided by total unique reads)
mean_guide_copy = len(a_goodguides_ADL2) / float(len(c))

In [26]:
# This gives the standard deviation of copies of guides
std_guide_copy = std([b for a, b in c.items()])

In [27]:
# The percent of guides within 2 standard deviations
float(len([count for seq, count in c.items() if count < (mean_guide_copy + 2*std_guide_copy)])) / len(c) * 100


Out[27]:
96.05061552279813

In [28]:
len(c.items())


Out[28]:
37854

In [11]:
hist(counts, bins)


Out[11]:
(array([  0.00000000e+00,   5.07900000e+03,   5.35200000e+03,
          5.93300000e+03,   6.57100000e+03,   6.50000000e+03,
          4.96200000e+03,   2.55800000e+03,   7.81000000e+02,
          1.03000000e+02,   1.20000000e+01,   3.00000000e+00]),
 array([   0,    1,    2,    4,    8,   16,   32,   64,  128,  256,  512,
        1024, 2048]),
 <a list of 12 Patch objects>)

In [12]:
sorted_c[0]


Out[12]:
(u'GCCAATACGGTCGAGAAAAC', 1)

In [13]:
from pandas import *

In [14]:
df = DataFrame(sorted_c)

In [15]:
import csv
with open('sorted_c.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for item in sorted_c:
        spamwriter.writerow(item)

In [32]:
df


Out[32]:
0 1
0 GCCAATACGGTCGAGAAAAC 1
1 GATGTAATAATCCAGTTGCC 1
2 AGGCTTGATCGAGATACCGC 1
3 GTAGCGACCGATACATTACC 1
4 AATGTTATTTGCTACTGTCC 1
5 CATCGCAGCCCTGAAAACGC 1
6 CGTGACTGAACTGATGATGC 1
7 TTCGCGCCGGCGTGTACACC 1
8 GTTATCAAGAACATAGATAC 1
9 TAGAGATCATAATGATCGCC 1
10 TGCGGCTGGTGCGCTTCATC 1
11 TGGCTTCTTGTTTGGCGCTC 1
12 AGCTCGAGAAAATAGCGATC 1
13 TAGCTTTGATGAACAAGGTC 1
14 GCTGGCGAAAACGAGATTGC 1
15 TAATGATGAAAGATTTCTCC 1
16 ACTGGAGAATAAAATTTATC 1
17 CGCTGAGATAGAACGCTGGC 1
18 TTTTGTGGCAAATGGGATGC 1
19 ATCAATCGCCAGCAGTGAGC 1
20 GGCAAGAGAGCATCGAAACC 1
21 CAGCAACTGGTTGCAGCTTC 1
22 GGCGCACCTGCCATATCTCC 1
23 TATTTTCAGGATCTGATGCC 1
24 TAATGCGCACCGTAGAGGTC 1
25 CCAAAAGTATTGCATTGTGC 1
26 GTCCGAGCATTTTCACCAGC 1
27 GTCCGGCACATCACGCCAGC 1
28 ATCAGCCAGTTAAACAGGTC 1
29 AAGCATAATCAGCACTTCAC 1
... ... ...
37824 GTGGCGGCAACAAAGCGATC 403
37825 GCGATGGCGGTGTTGATCAC 404
37826 CTGGGCACGGTAGGACGCAC 404
37827 GCAGCCGAGCGCGGAAACAC 413
37828 CCATAATGCGGGCGGTTGCC 414
37829 GGGAAAGGCGAAAAGAACCC 414
37830 CCATAATGCGGGCTGTTGCC 437
37831 GCGCCAAGCGAAAAGCAGAC 443
37832 ATCCGCACCAACGCGCAGCC 449
37833 CCGAAACAGTGCTCTACCCC 451
37834 GCGTAACGCGGATCGTGACC 464
37835 CCGCCATCACCCACAGTGCC 471
37836 GCCAGCAGCGAGCCGTTGCC 477
37837 GCCGCCGCCCCCTGCACCCC 509
37838 GCAGGCCAGCAGATCGGCCC 510
37839 GGTGTCGCCACGCCGCTGCC 514
37840 ACCACGCACCAGACCCATAC 520
37841 GCAATCCTCGCCAGTGAACC 535
37842 CTGGAAGCGGCGGCGCTGAC 539
37843 GCCGAACGCACCGTCCATAC 599
37844 ACCAGACACTGTCCGCAACC 629
37845 GCGTCCGTTGCAGGATCGAC 632
37846 CCGTGTGCGCGGTAGCTGAC 695
37847 ACCCCAAGGCTGGTAATGGC 702
37848 GCGCTCTGCTGGAAGTGCAC 728
37849 GTGCACCGACGCCGCGCCCC 741
37850 GCGCATTCTGGCGCATGATC 805
37851 GTTACCCGCAGAAGAAGCAC 1054
37852 GCAACTGCGCACGCGACACC 1374
37853 ACCAGCCGCTGCGGGCCACC 1474

37854 rows × 2 columns


In [24]:
from ggplot import *

In [42]:
df[1]


Out[42]:
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
...
37839     514
37840     520
37841     535
37842     539
37843     599
37844     629
37845     632
37846     695
37847     702
37848     728
37849     741
37850     805
37851    1054
37852    1374
37853    1474
Name: 1, Length: 37854, dtype: int64

In [84]:
DataFrame(bins)


Out[84]:
0
0 0
1 1
2 2
3 4
4 8
5 16
6 32
7 64
8 128
9 256
10 512
11 1024
12 2048

In [94]:
ggplot(df, aes(x=df[1]))  + \
    scale_x_log10()+ \
    geom_density(alpha=1, kernel="optcosine", adjust = 0.1, color= "red")


Out[94]:
<ggplot: (314387005)>

In [1]:
%load_ext rmagic

In [7]:
import scipy
import pylab
import numpy
from numpy import *
import rpy2
from rpy2 import *
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['pylab']
`%matplotlib` prevents importing * from pylab and numpy

In [8]:
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr

datasets = importr('datasets')
#mtcars = datasets.mtcars
mtcars = datasets.__rdata__.fetch('mtcars')['mtcars']

In [9]:
import rpy2.robjects as robjects
stats = importr('stats')   # importr was imported in the previous cell
rnorm = stats.rnorm
dataf_rnorm = robjects.DataFrame({'value': rnorm(300, mean=0) + rnorm(100, mean=3),
                                  'other_value': rnorm(300, mean=0) + rnorm(100, mean=3),
                                  'mean': robjects.IntVector([0, ]*300 + [3, ] * 100)})

In [10]:
gp = ggplot2.ggplot(mtcars)

In [11]:
pp = gp + \
     ggplot2.aes_string(x='wt', y='mpg') + \
     ggplot2.geom_point()

pp.plot()

In [ ]:





In [ ]:
featured_genome = SeqRecord(ecoligenome.seq)
from Bio import SeqUtils
from Bio.SeqUtils import *
import sys

for index, item in enumerate(bummerguidesseq):
    if len(item.seq) > 0:
        # setup toolbar
        #sys.stdout.flush() # These two lines produce the line counter as the loop runs
        #sys.stdout.write('\r' + str(index) + " "),
        featinfo = []
        fwd = nt_search(str(featured_genome.seq), str(item.seq))
        if len(fwd) > 1:
            featinfo.append((fwd, +1))
        rev = nt_search(str(featured_genome.seq), str(item.seq.reverse_complement()))
        if len(rev) > 1:
            featinfo.append((rev, -1))
        for f in featinfo:
            start = ExactPosition(f[0][1])
            end = ExactPosition(f[0][1] + len(item.seq))
            location = FeatureLocation(start, end)
            feature = SeqFeature(location,type=str("spacer"), strand = f[1])
            featured_genome.features.append(feature)     
# The output seq below is derived from the genomic search result, not the input

In [ ]:
SeqIO.write(featured_genome, "bummerguides.gb", "gb")

From 100,000 input reads, 27,453 correct-orientation spacers passed a minimum Phred quality of 20; 25,509 of these (92.918%) match the genome perfectly. From 100,000 input reads, 9,485 spacers passed a minimum Phred quality of 30, of which 8,769 (92.451%) map.
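
The quoted percentages follow directly from those counts:

In [ ]:
print 25509 / 27453.0 * 100   # -> 92.918...
print 8769 / 9485.0 * 100     # -> 92.451...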

In [79]:
print float(len(featured_genome.features))/len(alltgts)
print len(featured_genome.features)


0.924512387981
8769

In [36]:
handle = open("../../Genomic Data/Escherichia coli str. K-12 substr. MG1655, complete genome.fasta", 'rb')
ecoligenome = SeqIO.parse(handle, "fasta", alphabet=IUPACAmbiguousDNA())
ecoligenomedict = {}
ecoligenomedict["gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome"] = next(ecoligenome)

In [37]:
%run al_funcs.ipynb

In [38]:
for index, item in enumerate(cutslist[68:69]):
    summary = al_scoreguide(item, "ecolik12", ecoligenomedict)
    score, details = summary
    print score
    print zip([b[1][2] for b in details], [b[1][1] for b in details])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-38-d6aaa208c01c> in <module>()
      1 for index, item in enumerate(cutslist[68:69]):
      2     summary = al_scoreguide(item, "ecolik12", ecoligenomedict)
----> 3     score, details = summary
      4     print score
      5     print zip([b[1][2] for b in details], [b[1][1] for b in details])

ValueError: too many values to unpack

In [39]:
len(cutslist)


Out[39]:
92450

In [40]:
(item for item in cutslist[68:69])


Out[40]:
<generator object <genexpr> at 0x123d4ca50>

In [41]:
al_scoreguide(cutslist[68], "ecolik12", ecoligenomedict)


Out[41]:
(100,
 SeqRecord(seq=Seq('CGGGTAACAATC', IUPACAmbiguousDNA()), id='34_F', name='5534', description='HpaII', dbxrefs=['gi|545778205|gb|U00096.3|']),
 [[0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2458718,
    u'CGGGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    3198029,
    u'CGGGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    686730,
    u'GGGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    4467035,
    u'CGGGTAACAAT')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    16360,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    58981,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    151767,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    361576,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    608980,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    643165,
    u'GGGTAACAAT')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    852563,
    u'CGGGTAACAA')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    960515,
    u'GGGTAACAAT')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    1125084,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    1271505,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    1351266,
    u'GGGTAACAAT')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    1791222,
    u'CGGGTAACAA')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2455570,
    u'CGGGTAACAA')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2502278,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2515246,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2532220,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    2647128,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    3347229,
    u'CGGGTAACAA')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    3905701,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    4468638,
    u'GGTAACAATC')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    4515929,
    u'CGGGTAACAA')],
  [0.0,
   (u'gi|545778205|gb|U00096.3| Escherichia coli str. K-12 substr. MG1655, complete genome',
    4577856,
    u'GGTAACAATC')]])

In [42]:
cutslist[0]


Out[42]:
SeqRecord(seq=Seq('GGGACTCGCCGCCGCCCAGC', IUPACAmbiguousDNA()), id='0_F', name='564', description='HpaII', dbxrefs=['gi|545778205|gb|U00096.3|'])

In [ ]:
all_ecoli_scores = []
for index, item in enumerate(cutslist):
    if str(index)[-3:] == "000":
        print index
    all_ecoli_scores.append([al_scoreguide(item, "ecolik12", ecoligenomedict), item])

In [ ]:


In [47]:
pickle.dump(all_ecoli_scores, open( "HiSeqOut/Part 5 - functional mapping/all_ecoli_guidescores.p", "wb" ))

In [2]:
import cPickle as pickle

In [3]:
all_ecoli_scores = pickle.load(open( "HiSeqOut/Part 5 - functional mapping/all_ecoli_guidescores.p", "r" ))

In [4]:
import copy
a = []
for (score, details) in all_ecoli_scores:
    a.append(int(details.name))
resultssorted = zip([[r[0], b] for r, b in all_ecoli_scores], a)
from operator import itemgetter, attrgetter, methodcaller
resultssorted = sorted(resultssorted, key=itemgetter(0), reverse=True)
resultssorted = [item for item, _pos in resultssorted]  # discard the genomic position carried along for the zip

In [21]:
set([score, str(details.seq) in resultssorted])


Out[21]:
[100,
 SeqRecord(seq=Seq('CTCGTTGCTCTAAAAGCCCC', IUPACAmbiguousDNA()), id='46227_R', name='4641552', description='ScrFI', dbxrefs=['gi|545778205|gb|U00096.3|'])]

In [31]:
# The set stuff here collapses redundant guides
scores = [score for score, details in set([(score, str(details.seq)) for score, details in resultssorted])]
import numpy
bins = range(0,106, 5)
%pylab inline
figure(figsize=(5, 4))
hist(scores, bins, color="gray")
tick_params(axis=u'both', labelsize=18)
(n, bins, patches) = plt.hist(scores, bins, color="gray")
savefig('HiSeqOut/All_Ecoli_guide_scores_redundancy_removed.pdf', format="pdf")
#gca().set_xscale("log")


Populating the interactive namespace from numpy and matplotlib

In [34]:
len(scores)


Out[34]:
80894

In [32]:
print zip(n, bins)


[(45.0, 0), (16.0, 5), (13.0, 10), (38.0, 15), (23.0, 20), (170.0, 25), (53.0, 30), (20.0, 35), (64.0, 40), (64.0, 45), (193.0, 50), (38.0, 55), (67.0, 60), (311.0, 65), (423.0, 70), (37.0, 75), (8.0, 80), (45.0, 85), (55.0, 90), (2136.0, 95), (77075.0, 100)]

In [23]:
import collections

In [33]:
collections.Counter(scores)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-33-54fb7fd039d9> in <module>()
----> 1 collections.Counter(scores)

NameError: name 'collections' is not defined

In [36]:
for number, count in collections.Counter(scores).items():
    print((number, count*100.0/92450.0))


(0, 0.05300162249864792)
(1, 0.19578150351541374)
(2, 0.015143320713899405)
(3, 0.011898323418063819)
(4, 0.005408328826392645)
(5, 0.0075716603569497025)
(6, 0.001081665765278529)
(7, 0.011898323418063819)
(8, 0.005408328826392645)
(9, 0.012979989183342347)
(10, 0.001081665765278529)
(11, 0.002163331530557058)
(12, 0.001081665765278529)
(13, 0.005408328826392645)
(14, 0.029204975662520283)
(15, 0.0746349378042185)
(16, 0.16982152514872903)
(17, 0.016224986479177934)
(19, 0.015143320713899405)
(20, 0.009734991887506761)
(21, 0.014061654948620876)
(22, 0.017306652244456464)
(23, 0.008653326122228232)
(24, 0.0411032990805841)
(25, 1.040562466197945)
(26, 0.006489994591671174)
(27, 0.015143320713899405)
(28, 0.19686316928069228)
(29, 0.006489994591671174)
(30, 0.012979989183342347)
(32, 0.014061654948620876)
(33, 0.20118983234180637)
(34, 0.004326663061114116)
(35, 0.001081665765278529)
(36, 0.002163331530557058)
(37, 0.006489994591671174)
(38, 0.008653326122228232)
(39, 0.0411032990805841)
(40, 0.1795565170362358)
(41, 0.006489994591671174)
(42, 0.0075716603569497025)
(43, 0.006489994591671174)
(44, 0.019469983775013522)
(45, 0.022714981070849107)
(46, 0.01838831800973499)
(47, 0.02704164413196322)
(48, 0.01838831800973499)
(49, 0.047593293672255274)
(50, 0.49864791779340184)
(51, 0.008653326122228232)
(52, 0.02163331530557058)
(53, 0.004326663061114116)
(54, 0.009734991887506761)
(55, 0.008653326122228232)
(56, 0.004326663061114116)
(57, 0.004326663061114116)
(58, 0.004326663061114116)
(59, 0.024878312601406164)
(60, 0.01081665765278529)
(61, 0.009734991887506761)
(62, 0.028123309897241752)
(63, 0.008653326122228232)
(64, 0.03461330448891293)
(65, 0.03461330448891293)
(66, 0.599242833964305)
(67, 0.023796646836127637)
(69, 0.006489994591671174)
(70, 0.4553812871822607)
(71, 0.001081665765278529)
(72, 0.011898323418063819)
(73, 0.001081665765278529)
(74, 0.002163331530557058)
(77, 0.008653326122228232)
(78, 0.028123309897241752)
(79, 0.003244997295835587)
(80, 0.002163331530557058)
(81, 0.001081665765278529)
(83, 0.003244997295835587)
(84, 0.002163331530557058)
(85, 0.017306652244456464)
(86, 0.004326663061114116)
(87, 0.028123309897241752)
(88, 0.002163331530557058)
(89, 0.002163331530557058)
(90, 0.008653326122228232)
(91, 0.003244997295835587)
(92, 0.005408328826392645)
(93, 0.019469983775013522)
(94, 0.02704164413196322)
(95, 0.09843158464034614)
(96, 0.0746349378042185)
(97, 0.17090319091400757)
(98, 0.32017306652244454)
(99, 1.7923201730665224)
(100, 92.92266089778259)

In [34]:
collections.Counter(scores).items()


Out[34]:
[(0, 49),
 (1, 181),
 (2, 14),
 (3, 11),
 (4, 5),
 (5, 7),
 (6, 1),
 (7, 11),
 (8, 5),
 (9, 12),
 (10, 1),
 (11, 2),
 (12, 1),
 (13, 5),
 (14, 27),
 (15, 69),
 (16, 157),
 (17, 15),
 (19, 14),
 (20, 9),
 (21, 13),
 (22, 16),
 (23, 8),
 (24, 38),
 (25, 962),
 (26, 6),
 (27, 14),
 (28, 182),
 (29, 6),
 (30, 12),
 (32, 13),
 (33, 186),
 (34, 4),
 (35, 1),
 (36, 2),
 (37, 6),
 (38, 8),
 (39, 38),
 (40, 166),
 (41, 6),
 (42, 7),
 (43, 6),
 (44, 18),
 (45, 21),
 (46, 17),
 (47, 25),
 (48, 17),
 (49, 44),
 (50, 461),
 (51, 8),
 (52, 20),
 (53, 4),
 (54, 9),
 (55, 8),
 (56, 4),
 (57, 4),
 (58, 4),
 (59, 23),
 (60, 10),
 (61, 9),
 (62, 26),
 (63, 8),
 (64, 32),
 (65, 32),
 (66, 554),
 (67, 22),
 (69, 6),
 (70, 421),
 (71, 1),
 (72, 11),
 (73, 1),
 (74, 2),
 (77, 8),
 (78, 26),
 (79, 3),
 (80, 2),
 (81, 1),
 (83, 3),
 (84, 2),
 (85, 16),
 (86, 4),
 (87, 26),
 (88, 2),
 (89, 2),
 (90, 8),
 (91, 3),
 (92, 5),
 (93, 18),
 (94, 25),
 (95, 91),
 (96, 69),
 (97, 158),
 (98, 296),
 (99, 1657),
 (100, 85907)]

In [33]:


In [ ]: